library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrr)
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(tools)
prevalence_df = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
janitor::clean_names() |>
select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
filter(year == 2021)
## Rows: 228770 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): StateAbbr, StateDesc, LocationName, DataSource, Category, Measure,...
## dbl (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
prevalence_outcome = prevalence_df |>
filter(category == "Health Outcomes")
prevalence_chol = prevalence_outcome |>
filter(measure_id == "HIGHCHOL") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(chol_pre_2021 = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_risk = prevalence_df |>
filter(category == "Health Risk Behaviors")
risk_lpa = prevalence_risk |>
filter(measure_id == "LPA") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(lpa_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
risk_binge = prevalence_risk |>
filter(measure_id == "BINGE") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(binge_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
risk_smoking = prevalence_risk |>
filter(measure_id == "CSMOKING") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(smoking_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_prevention = prevalence_df |>
filter(category == "Prevention")
prevention_insurance = prevalence_prevention |>
filter(measure_id == "ACCESS2") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(insurance_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevention_cholscreen = prevalence_prevention |>
filter(measure_id == "CHOLSCREEN") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(cholscreen_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
prevalence_df_2020 = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
janitor::clean_names() |>
select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
filter(year == 2020)
## Rows: 228770 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): StateAbbr, StateDesc, LocationName, DataSource, Category, Measure,...
## dbl (5): Year, Data_Value, Low_Confidence_Limit, High_Confidence_Limit, Tot...
## lgl (2): Data_Value_Footnote_Symbol, Data_Value_Footnote
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
risk_sleep_2020 = prevalence_df_2020 |>
filter(category == "Health Risk Behaviors") |>
filter(measure_id == "SLEEP") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(sleep_pre = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
#anti_join(risk_sleep,risk_binge, by = 'state_abbr')
#FL in sleep but not in chol & other dfs
prevalence_chol_2020 = prevalence_df |>
filter(category == "Health Outcomes") |>
filter(measure_id == "HIGHCHOL") |>
select(state_abbr, data_value, total_population) |>
mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
group_by(state_abbr) |>
summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
distinct() |>
mutate(chol_pre_2020 = disease_popu / total_popu) |>
select(-disease_popu)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'state_abbr'. You can override using the
## `.groups` argument.
merged_2021 = Reduce(function(df1, df2) left_join(df1, df2, by = c("state_abbr", "total_popu")), list(prevalence_chol, risk_binge, risk_lpa, risk_smoking, prevention_insurance, prevention_cholscreen))
merged_2020 = left_join(prevalence_chol_2020, risk_sleep_2020)
## Joining with `by = join_by(state_abbr, total_popu)`
merged_total = left_join(merged_2021, merged_2020)
## Joining with `by = join_by(state_abbr, total_popu)`
Here is the prevalence plot!
x_range = range(c(pull(merged_total, chol_pre_2021), pull(merged_total, chol_pre_2020)))
y_range = range(c(pull(merged_total, binge_pre), pull(merged_total, lpa_pre), pull(merged_total, smoking_pre), pull(merged_total, insurance_pre), pull(merged_total, cholscreen_pre), pull(merged_total, sleep_pre)))
prevalence_plot =
merged_total |>
plot_ly() |>
add_trace(x = ~chol_pre_2021, y = ~binge_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Binge Drinking', showlegend = TRUE, visible = TRUE) |>
add_trace(x = ~chol_pre_2021, y = ~lpa_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'No Leisure-time Physical Activity', showlegend = TRUE, visible = TRUE) |>
add_trace(x = ~chol_pre_2021, y = ~smoking_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Smoking Currently', showlegend = TRUE, visible = TRUE) |>
add_trace(x = ~chol_pre_2021, y = ~insurance_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Lack of Health Insurance Currently', showlegend = TRUE, visible = TRUE) |>
add_trace(x = ~chol_pre_2021, y = ~cholscreen_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Cholesterol Screening', showlegend = TRUE, visible = TRUE) |>
add_trace(x = ~chol_pre_2021, y = ~sleep_pre, type = 'scatter', mode = 'markers', opacity = 0.75, marker = list(size = 9), name = 'Sleep Less Than 7 Hours', showlegend = TRUE, visible = TRUE) |>
layout(title = 'High Cholesterol v.s. Health-Affecting Behaviors',
xaxis = list(title = 'Prevalence of High Cholesterol', range = x_range + c(-0.01, 0.01), tickformat = '.0%', showline = TRUE),
yaxis = list(title = 'Prevalence of Health-Affecting Behavior', range = y_range + c(-0.05, 0.05), tickformat = '.0%', showline = TRUE))
prevalence_plot
Find top health behaviors that has largest correlations with high
cholesterol.
cor_table = data.frame(behavior = character(), correlation = numeric(), stringsAsFactors = FALSE)
behavs = colnames(merged_2021[4:8])
for (each in behavs) {
new_row = data.frame(behavior = substr(each, 1, nchar(each) - 4), correlation = cor(pull(merged_2021, chol_pre_2021), pull(merged_2021, each)))
cor_table = rbind(cor_table, new_row)
}
sleep_row = data.frame(behavior = 'sleep', correlation = cor(pull(merged_2020, chol_pre_2020), pull(merged_2020, sleep_pre)))
cor_table = rbind(cor_table, sleep_row)
cor_table = arrange(cor_table, desc(correlation))
cor_table = cor_table |>
mutate(behavior = case_match(behavior,
"lpa" ~ "No Leisure-time Physical Activity",
"sleep" ~ "Sleep Less Than 7 Hours",
"smoking" ~ "Smoking Currently",
"insurance" ~ "Lack of Health Insurance Currently",
"cholscreen" ~ "Cholesterol Screening",
"binge" ~ "Binge Drinking"
),
behavior = factor(behavior, levels = behavior)) |>
rename("Behavior" = behavior,
"Correlation" = correlation)
knitr::kable(cor_table)
| No Leisure-time Physical Activity |
0.6960863 |
| Sleep Less Than 7 Hours |
0.6695707 |
| Smoking Currently |
0.4931834 |
| Lack of Health Insurance Currently |
0.3606138 |
| Cholesterol Screening |
0.2124629 |
| Binge Drinking |
-0.5014168 |
Use a bar plot to visulize it.
cor_bar = plot_ly(cor_table, x = ~Behavior, y = ~Correlation, type = 'bar', opacity = 0.8, marker = list(color = "MidnightBlue")) |>
layout(title = 'Correlation Between High Cholesterol and Health Behaviors',
xaxis = list(title = 'Behavior', tickangle=-30),
yaxis = list(title = 'Correlation'))
cor_bar